package reptile

import (
	"crawler_translation/data"
	"crawler_translation/translate"
	"crawler_translation/utils"
	"errors"
	"fmt"
	"github.com/PuerkitoBio/goquery"
	"go.uber.org/zap"
	"io"
	"log"
	"net/http"
	"strings"
	"sync"
)

// Crawler module.

// Reptile is a sequential web crawler that fetches the configured menu
// pages, extracts content via a jQuery-style selector, translates it and
// stores the result.
type Reptile struct {
	// Menu entries (Domain + Path) to crawl.
	reptileUrlMaps []data.Menu
	// Read/write lock; not used by the methods visible here — presumably
	// reserved for the planned worker pool. TODO confirm.
	mu *sync.RWMutex
	// Desired number of worker goroutines; the pool is not implemented
	// yet (Run is single-threaded, see the TODO there).
	goroutineNumber int32
	// jQuery-style selector used to locate the content node in a page.
	query string
}

// NewReptile constructs a Reptile that will crawl the given menus using
// goroutineNumber workers (pool not yet implemented) and the supplied
// jQuery-style selector to extract page content.
func NewReptile(menus []data.Menu, goroutineNumber int32, query string) *Reptile {
	r := &Reptile{}
	r.reptileUrlMaps = menus
	r.mu = new(sync.RWMutex)
	r.goroutineNumber = goroutineNumber
	r.query = query
	return r
}

// Run crawls every configured menu entry one after another. A failure on
// one entry is logged and the loop moves on; Run only returns an error
// when there is nothing to crawl at all.
func (r *Reptile) Run() error {
	// TODO 开启线程池
	// 暂时使用单线程

	if len(r.reptileUrlMaps) == 0 {
		return errors.New("reptileUrlMaps is empty")
	}
	for _, menu := range r.reptileUrlMaps {
		target := menu.Domain + menu.Path
		utils.Logger.Info("Start crawling...", zap.String("url", target))
		if err := r.handleFun(menu); err != nil {
			utils.Logger.Error("Error crawling...", zap.String("url", target), zap.Error(err))
			continue
		}
		utils.Logger.Info("Success crawling...", zap.String("url", target))
	}
	return nil
}

// 爬取数据处理
func (r *Reptile) handleFun(menu data.Menu) error {
	// 爬取网页内容
	pageBody, err := r.crawlingWebPage(menu.Domain + menu.Path)
	if err != nil {
		return err
	}

	// 获取指定html中的内容
	content, err := r.getBodyByClass(pageBody, func(doc *goquery.Document) (data string, err error) {
		data, err = doc.Find(r.query).Html()
		if err != nil {
			return "", fmt.Errorf("获取getBodyClass [.theme-doc-markdown] err = %s", err)
		}
		return
	})
	if err != nil {
		return err
	}

	hash := utils.MD5(content)
	if hash == menu.Hash {
		utils.Logger.Info("hash对比相同，跳过...", zap.String("url", menu.Path))
		return nil
	}

	// 更新hash
	if err = data.MConn.UpdateMenu(hash, menu.Id); err != nil {
		return err
	}

	// 翻译
	translateIMR := translate.NewTranslate()
	contentZh, err := translateIMR.HtmlToEn(content)
	if err != nil {
		return err
	}

	// 获取title
	title, _ := r.getBodyByClass(pageBody, func(doc *goquery.Document) (data string, err error) {
		title := doc.Find("title").Text()
		return title, nil
	})
	if len(title) > 0 {
		title, err = translateIMR.TranslateEn2Ch(title)
	}

	// 获取description
	description, err := r.getBodyByClass(pageBody, func(doc *goquery.Document) (data string, err error) {
		description, exit := doc.Find("[name='description']").Attr("content")
		if !exit {
			return "", fmt.Errorf("不存在 description")
		}
		return description, nil
	})
	if err == nil {
		description, err = translateIMR.TranslateEn2Ch(description)
	}

	// 存储
	if err = data.MConn.StoreDocs(data.Docs{
		Id:          menu.Id,
		Content:     contentZh,
		Path:        menu.Path,
		Title:       title,
		Description: description,
	}, menu.Id); err != nil {
		return err
	}

	return nil
}

// 爬取网页内容
func (r *Reptile) crawlingWebPage(url string) (content string, err error) {
	res, err := http.Get(url)
	if err != nil {
		log.Fatal(err)
	}
	defer res.Body.Close()
	if res.StatusCode != 200 {
		return "", errors.New(fmt.Sprintf("status code error: %d %s", res.StatusCode, res.Status))
	}

	pageBytes, err := io.ReadAll(res.Body)
	if err != nil {
		return "", err
	}

	content = string(pageBytes)

	return
}

// getBodyByClass parses pageContent into a goquery document and hands it
// to the supplied extractor f, returning whatever f extracts (or the
// parse/extract error).
func (r *Reptile) getBodyByClass(pageContent string, f func(doc *goquery.Document) (data string, err error)) (data string, err error) {
	doc, parseErr := goquery.NewDocumentFromReader(strings.NewReader(pageContent))
	if parseErr != nil {
		return "", parseErr
	}
	return f(doc)
}
